In [1]:
# imports..

from sklearn.datasets import load_boston
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.simplefilter('ignore')
In [2]:
# Gather Data....and Explore

boston_dataset=load_boston()
In [3]:
type(boston_dataset)
Out[3]:
sklearn.utils.Bunch
In [4]:
boston_dataset
Out[4]:
{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
        19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
        20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
        23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
        33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
        21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
        20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
        23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
        15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,
        17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,
        25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,
        23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,
        32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,
        34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,
        20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,
        26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,
        31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,
        22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,
        42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,
        36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,
        32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,
        20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,
        20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,
        22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,
        21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,
        19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,
        32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,
        18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,
        16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,
        13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,
         7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,
        12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,
        27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,
         8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,
         9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,
        10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,
        15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,
        19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,
        29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,
        20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,
        23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9]),
 'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
        'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'),
 'DESCR': ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000\n        - PTRATIO  pupil-teacher ratio by town\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n        - LSTAT    % lower status of the population\n        - MEDV     Median value of owner-occupied homes in $1000's\n\n    :Missing Attribute Values: None\n\n    :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980.   N.B. Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems.   \n     \n.. topic:: References\n\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n",
 'filename': 'C:\\Users\\hp\\anaconda3\\lib\\site-packages\\sklearn\\datasets\\data\\boston_house_prices.csv'}
In [5]:
dir(boston_dataset)
Out[5]:
['DESCR', 'data', 'feature_names', 'filename', 'target']
In [6]:
print(boston_dataset.DESCR)
.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/


This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression
problems.   
     
.. topic:: References

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.

In [7]:
print(boston_dataset.data)
[[6.3200e-03 1.8000e+01 2.3100e+00 ... 1.5300e+01 3.9690e+02 4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9690e+02 9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9283e+02 4.0300e+00]
 ...
 [6.0760e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 5.6400e+00]
 [1.0959e-01 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9345e+02 6.4800e+00]
 [4.7410e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 7.8800e+00]]
In [8]:
print(boston_dataset.feature_names)
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
In [9]:
print(boston_dataset.filename)
C:\Users\hp\anaconda3\lib\site-packages\sklearn\datasets\data\boston_house_prices.csv
In [10]:
print(boston_dataset.target)
[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.2 20.4 19.3 22.  20.3 20.5 17.3 18.8 21.4
 15.7 16.2 18.  14.3 19.2 19.6 23.  18.4 15.6 18.1 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.8 15.6 14.6 17.8 15.4 21.5 19.6 15.3 19.4
 17.  15.6 13.1 41.3 24.3 23.3 27.  50.  50.  50.  22.7 25.  50.  23.8
 23.8 22.3 17.4 19.1 23.1 23.6 22.6 29.4 23.2 24.6 29.9 37.2 39.8 36.2
 37.9 32.5 26.4 29.6 50.  32.  29.8 34.9 37.  30.5 36.4 31.1 29.1 50.
 33.3 30.3 34.6 34.9 32.9 24.1 42.3 48.5 50.  22.6 24.4 22.5 24.4 20.
 21.7 19.3 22.4 28.1 23.7 25.  23.3 28.7 21.5 23.  26.7 21.7 27.5 30.1
 44.8 50.  37.6 31.6 46.7 31.5 24.3 31.7 41.7 48.3 29.  24.  25.1 31.5
 23.7 23.3 22.  20.1 22.2 23.7 17.6 18.5 24.3 20.5 24.5 26.2 24.4 24.8
 29.6 42.8 21.9 20.9 44.  50.  36.  30.1 33.8 43.1 48.8 31.  36.5 22.8
 30.7 50.  43.5 20.7 21.1 25.2 24.4 35.2 32.4 32.  33.2 33.1 29.1 35.1
 45.4 35.4 46.  50.  32.2 22.  20.1 23.2 22.3 24.8 28.5 37.3 27.9 23.9
 21.7 28.6 27.1 20.3 22.5 29.  24.8 22.  26.4 33.1 36.1 28.4 33.4 28.2
 22.8 20.3 16.1 22.1 19.4 21.6 23.8 16.2 17.8 19.8 23.1 21.  23.8 23.1
 20.4 18.5 25.  24.6 23.  22.2 19.3 22.6 19.8 17.1 19.4 22.2 20.7 21.1
 19.5 18.5 20.6 19.  18.7 32.7 16.5 23.9 31.2 17.5 17.2 23.1 24.5 26.6
 22.9 24.1 18.6 30.1 18.2 20.6 17.8 21.7 22.7 22.6 25.  19.9 20.8 16.8
 21.9 27.5 21.9 23.1 50.  50.  50.  50.  50.  13.8 13.8 15.  13.9 13.3
 13.1 10.2 10.4 10.9 11.3 12.3  8.8  7.2 10.5  7.4 10.2 11.5 15.1 23.2
  9.7 13.8 12.7 13.1 12.5  8.5  5.   6.3  5.6  7.2 12.1  8.3  8.5  5.
 11.9 27.9 17.2 27.5 15.  17.2 17.9 16.3  7.   7.2  7.5 10.4  8.8  8.4
 16.7 14.2 20.8 13.4 11.7  8.3 10.2 10.9 11.   9.5 14.5 14.1 16.1 14.3
 11.7 13.4  9.6  8.7  8.4 12.8 10.5 17.1 18.4 15.4 10.8 11.8 14.9 12.6
 14.1 13.  13.4 15.2 16.1 17.8 14.9 14.1 12.7 13.5 14.9 20.  16.4 17.7
 19.5 20.2 21.4 19.9 19.  19.1 19.1 20.1 19.9 19.6 23.2 29.8 13.8 13.3
 16.7 12.  14.6 21.4 23.  23.7 25.  21.8 20.6 21.2 19.1 20.6 15.2  7.
  8.1 13.6 20.1 21.8 24.5 23.1 19.7 18.3 21.2 17.5 16.8 22.4 20.6 23.9
 22.  11.9]
In [11]:
type(boston_dataset.data)
Out[11]:
numpy.ndarray
In [12]:
boston_dataset.data.shape
Out[12]:
(506, 13)
In [13]:
data = pd.DataFrame(data = boston_dataset.data, columns = boston_dataset.feature_names)
In [14]:
data
Out[14]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33
... ... ... ... ... ... ... ... ... ... ... ... ... ...
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 21.0 391.99 9.67
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 21.0 396.90 9.08
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 21.0 396.90 5.64
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 21.0 393.45 6.48
505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 21.0 396.90 7.88

506 rows × 13 columns

In [15]:
data['PRICE'] = boston_dataset.target
In [16]:
data.head()
Out[16]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT PRICE
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 36.2
In [17]:
data.tail()
Out[17]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT PRICE
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 21.0 391.99 9.67 22.4
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 21.0 396.90 9.08 20.6
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 21.0 396.90 5.64 23.9
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 21.0 393.45 6.48 22.0
505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 21.0 396.90 7.88 11.9
In [18]:
data.count()
Out[18]:
CRIM       506
ZN         506
INDUS      506
CHAS       506
NOX        506
RM         506
AGE        506
DIS        506
RAD        506
TAX        506
PTRATIO    506
B          506
LSTAT      506
PRICE      506
dtype: int64
In [19]:
pd.isnull(data)
Out[19]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT PRICE
0 False False False False False False False False False False False False False False
1 False False False False False False False False False False False False False False
2 False False False False False False False False False False False False False False
3 False False False False False False False False False False False False False False
4 False False False False False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
501 False False False False False False False False False False False False False False
502 False False False False False False False False False False False False False False
503 False False False False False False False False False False False False False False
504 False False False False False False False False False False False False False False
505 False False False False False False False False False False False False False False

506 rows × 14 columns

In [20]:
pd.isnull(data).any()
Out[20]:
CRIM       False
ZN         False
INDUS      False
CHAS       False
NOX        False
RM         False
AGE        False
DIS        False
RAD        False
TAX        False
PTRATIO    False
B          False
LSTAT      False
PRICE      False
dtype: bool
In [21]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  PRICE    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB
In [22]:
plt.hist(data['PRICE'])
plt.show()
In [23]:
plt.hist(data['PRICE'], bins = 30)
plt.show()
In [24]:
plt.figure(figsize= (10,6))
plt.hist(data['PRICE'], bins =50, ec = 'black', color = '#800000', alpha = 0.7)
plt.xlabel('Price in 1000\'s')
plt.ylabel('Nr of Houses')
plt.show()
In [25]:
sns.distplot(data['PRICE'])
plt.show()
In [26]:
plt.figure(figsize= (10,6))
sns.distplot(data['PRICE'], color = 'red', bins = 50)
plt.show()
In [27]:
plt.figure(figsize= (10,6))
sns.distplot(data['PRICE'], color = 'red', bins = 50, hist=False)
plt.show()
In [28]:
plt.figure(figsize= (10,6))
sns.distplot(data['PRICE'], color = 'red', bins = 50,kde=False )
plt.show()
In [29]:
plt.figure(figsize= (10,6))
plt.hist(data['RM'], ec = 'black', color = 'pink')# removed bins
plt.xlabel('Average Number of Rooms')
plt.ylabel('Nr of Houses')
plt.show()
In [30]:
data['RM'].mean()
Out[30]:
6.284634387351787
In [31]:
plt.figure(figsize= (10,6))
plt.hist(data['RAD'], ec = 'black', color = 'green')
plt.xlabel('Accessibility to Highways')
plt.ylabel('Nr of Houses')
plt.show()
In [32]:
data['RAD'].value_counts()
Out[32]:
24.0    132
5.0     115
4.0     110
3.0      38
6.0      26
8.0      24
2.0      24
1.0      20
7.0      17
Name: RAD, dtype: int64
In [33]:
plt.figure(figsize= (10,6))
plt.hist(data['RAD'], ec = 'black', bins = 24, color = 'green')
plt.xlabel('Accessibility to Highways')
plt.ylabel('Nr of Houses')
plt.show()
In [34]:
accessibility = data['RAD'].value_counts()
In [35]:
accessibility.values
Out[35]:
array([132, 115, 110,  38,  26,  24,  24,  20,  17], dtype=int64)
In [36]:
print(accessibility.index)
Float64Index([24.0, 5.0, 4.0, 3.0, 6.0, 8.0, 2.0, 1.0, 7.0], dtype='float64')
In [37]:
type(accessibility)
Out[37]:
pandas.core.series.Series
In [38]:
plt.figure(figsize= (10,6))
plt.bar(accessibility.index, accessibility )
plt.show()
In [39]:
plt.figure(figsize= (10,6))
plt.bar(accessibility.index, accessibility, color ='brown', ec = 'blue', width = 0.5)# changing the width, color and ec
plt.xlabel('Accessibility index to Highways')
plt.ylabel('Nr of Houses')
plt.show()
In [40]:
data['CHAS'].value_counts()
Out[40]:
0.0    471
1.0     35
Name: CHAS, dtype: int64
In [41]:
data['PRICE'].min()
Out[41]:
5.0
In [42]:
data['PRICE'].max()
Out[42]:
50.0
In [43]:
data['PRICE'].mean()
Out[43]:
22.532806324110698
In [44]:
data['PRICE'].median()
Out[44]:
21.2
In [45]:
data.min()
Out[45]:
CRIM         0.00632
ZN           0.00000
INDUS        0.46000
CHAS         0.00000
NOX          0.38500
RM           3.56100
AGE          2.90000
DIS          1.12960
RAD          1.00000
TAX        187.00000
PTRATIO     12.60000
B            0.32000
LSTAT        1.73000
PRICE        5.00000
dtype: float64
In [46]:
data.max()
Out[46]:
CRIM        88.9762
ZN         100.0000
INDUS       27.7400
CHAS         1.0000
NOX          0.8710
RM           8.7800
AGE        100.0000
DIS         12.1265
RAD         24.0000
TAX        711.0000
PTRATIO     22.0000
B          396.9000
LSTAT       37.9700
PRICE       50.0000
dtype: float64
In [47]:
data.mean()
Out[47]:
CRIM         3.613524
ZN          11.363636
INDUS       11.136779
CHAS         0.069170
NOX          0.554695
RM           6.284634
AGE         68.574901
DIS          3.795043
RAD          9.549407
TAX        408.237154
PTRATIO     18.455534
B          356.674032
LSTAT       12.653063
PRICE       22.532806
dtype: float64
In [48]:
data.median()
Out[48]:
CRIM         0.25651
ZN           0.00000
INDUS        9.69000
CHAS         0.00000
NOX          0.53800
RM           6.20850
AGE         77.50000
DIS          3.20745
RAD          5.00000
TAX        330.00000
PTRATIO     19.05000
B          391.44000
LSTAT       11.36000
PRICE       21.20000
dtype: float64
In [49]:
data.describe()
Out[49]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT PRICE
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063 22.532806
std 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062 9.197104
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000 17.025000
50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000 21.200000
75% 3.677083 12.500000 18.100000 0.000000 0.624000 6.623500 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000
In [50]:
# Correlation

$$ \rho_{xy} = Corr(X,Y) $$

$$ -1.0 \leq \rho_{xy} \leq +1.0 $$

In [51]:
data['PRICE'].corr(data['RM'])
Out[51]:
0.695359947071539
In [52]:
data['PRICE'].corr(data['PTRATIO'])
Out[52]:
-0.5077866855375615
In [53]:
data.corr()
Out[53]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT PRICE
CRIM 1.000000 -0.200469 0.406583 -0.055892 0.420972 -0.219247 0.352734 -0.379670 0.625505 0.582764 0.289946 -0.385064 0.455621 -0.388305
ZN -0.200469 1.000000 -0.533828 -0.042697 -0.516604 0.311991 -0.569537 0.664408 -0.311948 -0.314563 -0.391679 0.175520 -0.412995 0.360445
INDUS 0.406583 -0.533828 1.000000 0.062938 0.763651 -0.391676 0.644779 -0.708027 0.595129 0.720760 0.383248 -0.356977 0.603800 -0.483725
CHAS -0.055892 -0.042697 0.062938 1.000000 0.091203 0.091251 0.086518 -0.099176 -0.007368 -0.035587 -0.121515 0.048788 -0.053929 0.175260
NOX 0.420972 -0.516604 0.763651 0.091203 1.000000 -0.302188 0.731470 -0.769230 0.611441 0.668023 0.188933 -0.380051 0.590879 -0.427321
RM -0.219247 0.311991 -0.391676 0.091251 -0.302188 1.000000 -0.240265 0.205246 -0.209847 -0.292048 -0.355501 0.128069 -0.613808 0.695360
AGE 0.352734 -0.569537 0.644779 0.086518 0.731470 -0.240265 1.000000 -0.747881 0.456022 0.506456 0.261515 -0.273534 0.602339 -0.376955
DIS -0.379670 0.664408 -0.708027 -0.099176 -0.769230 0.205246 -0.747881 1.000000 -0.494588 -0.534432 -0.232471 0.291512 -0.496996 0.249929
RAD 0.625505 -0.311948 0.595129 -0.007368 0.611441 -0.209847 0.456022 -0.494588 1.000000 0.910228 0.464741 -0.444413 0.488676 -0.381626
TAX 0.582764 -0.314563 0.720760 -0.035587 0.668023 -0.292048 0.506456 -0.534432 0.910228 1.000000 0.460853 -0.441808 0.543993 -0.468536
PTRATIO 0.289946 -0.391679 0.383248 -0.121515 0.188933 -0.355501 0.261515 -0.232471 0.464741 0.460853 1.000000 -0.177383 0.374044 -0.507787
B -0.385064 0.175520 -0.356977 0.048788 -0.380051 0.128069 -0.273534 0.291512 -0.444413 -0.441808 -0.177383 1.000000 -0.366087 0.333461
LSTAT 0.455621 -0.412995 0.603800 -0.053929 0.590879 -0.613808 0.602339 -0.496996 0.488676 0.543993 0.374044 -0.366087 1.000000 -0.737663
PRICE -0.388305 0.360445 -0.483725 0.175260 -0.427321 0.695360 -0.376955 0.249929 -0.381626 -0.468536 -0.507787 0.333461 -0.737663 1.000000
In [54]:
# Heatmap
In [55]:
type(data.corr())
Out[55]:
pandas.core.frame.DataFrame
In [56]:
plt.figure(figsize =(16,10))
sns.heatmap(data.corr(), annot = True, annot_kws ={'size':14} )
plt.show()

mask : boolean array or DataFrame, optional If passed, data will not be shown in cells where mask is True. Cells with missing values are automatically masked.

In [57]:
masker = np.zeros_like(data.corr())
In [58]:
masker
Out[58]:
array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
In [59]:
type(masker)
Out[59]:
numpy.ndarray
In [60]:
triangle_indices = np.triu_indices_from(masker)
In [61]:
triangle_indices
Out[61]:
(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,
         5,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  8,
         8,  8,  8,  8,  8,  9,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11,
        12, 12, 13], dtype=int64),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  1,  2,  3,
         4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  2,  3,  4,  5,  6,  7,  8,
         9, 10, 11, 12, 13,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  4,
         5,  6,  7,  8,  9, 10, 11, 12, 13,  5,  6,  7,  8,  9, 10, 11, 12,
        13,  6,  7,  8,  9, 10, 11, 12, 13,  7,  8,  9, 10, 11, 12, 13,  8,
         9, 10, 11, 12, 13,  9, 10, 11, 12, 13, 10, 11, 12, 13, 11, 12, 13,
        12, 13, 13], dtype=int64))
In [62]:
masker[triangle_indices] = True 
In [63]:
masker
Out[63]:
array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])
In [64]:
plt.figure(figsize =(16,10))
sns.heatmap(data.corr(), annot = True, annot_kws ={'size':14}, mask = masker )
plt.xticks(fontsize = 10)
plt.yticks(fontsize = 10)
plt.show()
In [65]:
nox_dis_corr = round(data['NOX'].corr(data['DIS']), 3)

plt.figure(figsize = (16, 8))

plt.title (f'Dist from Employment Centres vs Pollution , Corr ({nox_dis_corr})',
           
           fontsize = 25, color = 'green')
           
plt.scatter(x=data['DIS'], y =data['NOX'], alpha = 0.5, color = 'red', s= 80)

plt.xlabel( 'Distance from Employment Centres...DIS', fontsize =14, color = 'green' )

plt.ylabel( 'Level of Pollutants Nitric Oxide...NOX', fontsize =14, color = 'green' )
          

plt.show()
In [66]:
sns.set()
sns.set_style('dark')
sns.jointplot(x=data['DIS'], y =data['NOX'], color = 'red', joint_kws = {'alpha' : 0.5},)
plt.show()
In [67]:
sns.set() # to reset any previous styling
sns.set_style('dark')# to set style.
plt.figure(figsize = (16, 8))
sns.scatterplot(x=data['DIS'], y =data['NOX'], color = 'red', alpha = 0.5) # joint plot creates  scatter plot 
plt.show()
In [68]:
sns.set() 
sns.set_style('dark')
sns.jointplot(x=data['DIS'], y =data['NOX'], kind = 'hex' ) # kind is hex, darker hexagons in regions of higher density
plt.show()
In [69]:
sns.set() # to reset any previous styling
sns.set_style('dark')# to set style.
sns.jointplot(x=data['TAX'], y =data['RAD'], color = 'darkred', joint_kws = {'alpha' : 0.5} ) # jointplot gives  scatter plot 
plt.show()
In [70]:
sns.lmplot(x ='TAX', y = 'RAD', data = data)
plt.show()
In [71]:
sns.set() # to reset any previous styling
sns.set_style('dark')# to set style.
# height  = 7 is optional
sns.jointplot(x=data['RM'], y =data['PRICE'], color = 'indigo', kind = 'scatter', height = 7) # jointplot gives scatter plot 
plt.show()
In [72]:
sns.lmplot(x ='RM', y = 'PRICE', data = data, height =10, aspect = 2)
plt.show()
In [73]:
sns.lmplot(x ='RM', y = 'PRICE', data = data, height =10, aspect = 2, ci = None)
plt.show()
In [74]:
rm_price_corr = round(data['RM'].corr(data['PRICE']), 3)#calculating the corelation b/n NOX and DIS, also rounding to 3 decimals
plt.figure(figsize = (16, 8))
plt.title (f'Rooms  vs Price , Corr ({rm_price_corr})',
                       fontsize = 25, color = 'green'  )# using the f string notation to print the Correlatiin value in title

plt.scatter(x=data['RM'], y =data['PRICE'], alpha = 0.5, color = 'red', s= 80)
plt.xlabel( 'RM ....No of ROOMs ', fontsize =14, color = 'green' )
plt.ylabel( 'House price...PRICE', fontsize =14, color = 'green' )
plt.show()
In [75]:
# sns.pairplot(data)
# plt.show()
In [76]:
# sns.pairplot(data, kind = 'reg')
# plt.show()

Regression Multivariable

In [77]:
prices = data['PRICE'] # Target
features = data.drop('PRICE', axis = 1 ) # Features
In [78]:
X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size = 0.2, random_state = 10)
In [79]:
len(X_train)/len(features)
Out[79]:
0.7984189723320159
In [80]:
X_train.head()
Out[80]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
50 0.08873 21.0 5.64 0.0 0.439 5.963 45.7 6.8147 4.0 243.0 16.8 395.56 13.45
367 13.52220 0.0 18.10 0.0 0.631 3.863 100.0 1.5106 24.0 666.0 20.2 131.42 13.33
34 1.61282 0.0 8.14 0.0 0.538 6.096 96.9 3.7598 4.0 307.0 21.0 248.31 20.34
78 0.05646 0.0 12.83 0.0 0.437 6.232 53.7 5.0141 5.0 398.0 18.7 386.40 12.34
172 0.13914 0.0 4.05 0.0 0.510 5.572 88.5 2.5961 5.0 296.0 16.6 396.90 14.69
In [81]:
regr = LinearRegression()
regr.fit(X_train, y_train)
Out[81]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [82]:
np.round(regr.coef_, decimals =4)
Out[82]:
array([-1.2820e-01,  6.3200e-02, -7.6000e-03,  1.9745e+00, -1.6272e+01,
        3.1085e+00,  1.6300e-02, -1.4830e+00,  3.0400e-01, -1.2100e-02,
       -8.2030e-01,  1.1400e-02, -5.8160e-01])
In [83]:
regr.intercept_
Out[83]:
36.53305138282439
In [84]:
pd.DataFrame(data = regr.coef_, index = X_train.columns, columns =['Coef'] )
Out[84]:
Coef
CRIM -0.128181
ZN 0.063198
INDUS -0.007576
CHAS 1.974515
NOX -16.271989
RM 3.108456
AGE 0.016292
DIS -1.483014
RAD 0.303988
TAX -0.012082
PTRATIO -0.820306
B 0.011419
LSTAT -0.581626

image.png

In [85]:
regr.score(X_train, y_train)
Out[85]:
0.750121534530608
In [86]:
regr.score(X_test, y_test)
Out[86]:
0.6709339839115628
In [87]:
## Normal Distribution

image.png

In [88]:
from scipy.stats import norm
In [89]:
plt.figure(figsize = (10,5))

x_axis = np.arange(-10, 10, 0.001)

# mean = 0, SD = 3

plt.plot(x_axis, norm.pdf(x_axis,0,3))

plt.show()
In [90]:
# Data Transformations
In [91]:
plt.figure(figsize= (16,8))
plt.hist(data['PRICE'], bins =50, ec = 'black', color = 'blue', alpha = 0.5)# bins 50...setting color by  hexcode
plt.xlabel('Price in 1000\'s')
plt.ylabel('Nr of Houses')
plt.show()
In [92]:
data['PRICE'].skew()
Out[92]:
1.1080984082549072
In [93]:
y_log = np.log(data['PRICE'])
In [94]:
y_log.skew()
Out[94]:
-0.33032129530987864

image.png

image.png

image.png

image.png

image.png

image.png

In [95]:
sns.set()
plt.figure(figsize=(20,6))
sns.distplot(y_log)
plt.title(f'Log price: The skew is {round(y_log.skew(),2)}', fontsize = 25)# note the f string used to print the y_log.skew()
plt.xlabel('LOG PRICES ', fontsize =14, color = 'green' )
plt.show()
In [96]:
transformed_data = features # new data frame
transformed_data['LOG_PRICE'] = y_log # added the log price to the new data frame called transformed data
transformed_data
Out[96]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT LOG_PRICE
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 3.178054
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 3.072693
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 3.546740
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 3.508556
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 3.589059
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 21.0 391.99 9.67 3.109061
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 21.0 396.90 9.08 3.025291
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 21.0 396.90 5.64 3.173878
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 21.0 393.45 6.48 3.091042
505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 21.0 396.90 7.88 2.476538

506 rows × 14 columns

In [97]:
sns.lmplot(x='LSTAT', y='PRICE', data=data, height=10, aspect =2, 
           scatter_kws={'alpha': 0.6}, line_kws={'color':'darkred'})
plt.show()
In [98]:
sns.lmplot(x='LSTAT', y='LOG_PRICE', data=transformed_data, height =10,aspect = 2, 
           scatter_kws={'alpha': 0.6}, line_kws={'color':'blue'})
plt.show()

image.png

image.png

In [99]:
prices = np.log(data['PRICE']) # Use log prices
features = data.drop('PRICE', axis=1)

X_train, X_test, y_train, y_test = train_test_split(features, prices, 
                                                    test_size=0.2, random_state=10)

regr = LinearRegression()
regr.fit(X_train, y_train)

print('Training data r-squared:', regr.score(X_train, y_train))
print('Test data r-squared:', regr.score(X_test, y_test))

 
pd.DataFrame(data=regr.coef_, index=X_train.columns, columns=['coef'])
Training data r-squared: 0.7930234826697583
Test data r-squared: 0.7446922306260724
Out[99]:
coef
CRIM -0.010672
ZN 0.001579
INDUS 0.002030
CHAS 0.080331
NOX -0.704068
RM 0.073404
AGE 0.000763
DIS -0.047633
RAD 0.014565
TAX -0.000645
PTRATIO -0.034795
B 0.000516
LSTAT -0.031390
In [100]:
print('Intercept', regr.intercept_)
Intercept 4.05994387177519

image.png

image.png

In [101]:
# Evaluating P values using statsmodel....and also the coefficients using the stats model
In [102]:
#------------------------------------------Comments-----------------------------------------------------------------------------
# 1. Using statsmodel.... please import ....import statsmodels.api as sm.....in the uppermost cell
# 2. Using statsmodel we can Evaluate, the p-values and coefficients
# 3. OLS stands for ORDINARY LEAST SQUARES, gives a linear regression model
# 4. We pass the y_train and X_train, into sm.OLS
# 5. We have to use sm.add_constant(X_train),only then we will be able to get the value of the constant(intercept)
# 6. We then create a pandas DataFrame, to hold the Coefficients and P-values
# 7. Deprecation warnings if any, maybe ignored or acted upon as in warnings, see import statement topmost cell
#------------------------------------------------------------------------------------------------------------------------------#

#results = sm.OLS(y_train, X_train.assign(const=1)).fit()
results = sm.OLS(y_train, sm.add_constant(X_train)).fit()# obtaining the regression coeffs including the constant(intercept).
print(results.params, '\n')# printing the values of the coefficients including the consstant(intercept) 
print(results.pvalues,'\n')# printing the p values
pd.DataFrame({'coef': results.params, 'p-value': round(results.pvalues, 3)})# created DataFrame,p-values rounded to 3 decimals
# any p value more than 0.05 is not significant, look at the INDUS feature and AGE feature....
const      4.059944
CRIM      -0.010672
ZN         0.001579
INDUS      0.002030
CHAS       0.080331
NOX       -0.704068
RM         0.073404
AGE        0.000763
DIS       -0.047633
RAD        0.014565
TAX       -0.000645
PTRATIO   -0.034795
B          0.000516
LSTAT     -0.031390
dtype: float64 

const      1.188124e-52
CRIM       1.753774e-14
ZN         8.607727e-03
INDUS      4.445368e-01
CHAS       3.824015e-02
NOX        2.729624e-05
RM         1.089112e-04
AGE        2.091731e-01
DIS        1.812912e-07
RAD        3.746363e-07
TAX        5.145680e-05
PTRATIO    7.549821e-09
B          6.326839e-06
LSTAT      3.105305e-37
dtype: float64 

Out[102]:
coef p-value
const 4.059944 0.000
CRIM -0.010672 0.000
ZN 0.001579 0.009
INDUS 0.002030 0.445
CHAS 0.080331 0.038
NOX -0.704068 0.000
RM 0.073404 0.000
AGE 0.000763 0.209
DIS -0.047633 0.000
RAD 0.014565 0.000
TAX -0.000645 0.000
PTRATIO -0.034795 0.000
B 0.000516 0.000
LSTAT -0.031390 0.000
In [103]:
# Multicollinearity

image.png

$$ TAX = \alpha _0 + \alpha _1 RM + \alpha _2 NOX + ... + \alpha _{12}LSTAT $$

$$ VIF _{TAX} = \frac{1}{(1 - R _{TAX} ^ 2)} $$

image.png

image.png

In [104]:
# 1. import.....from statsmodels.stats.outliers_influence import variance_inflation_factor
# 2. exog = using X_train with added constant.and getting the values from it.
# 3. sm.add_constant(x_train) will give a DataFrame
# 4. If you want to check do check (type(sm.add_constant(X_train)))
# 4. exog has to be a ndarray
# 5. Hence we used the .values, to passs only the values from the DataFrame.
# 3. exog.idx = index of the feature that we need to test
In [105]:
variance_inflation_factor(exog=sm.add_constant(X_train).values, exog_idx=10)# TAX is at index 10
Out[105]:
8.508856493040817

write a for loop that prints out all the VIFs for all the features

In [106]:
X_incl_const = sm.add_constant(X_train)
X_incl_const.shape[1]
Out[106]:
14
In [107]:
vif = []

for i in range(X_incl_const.shape[1]):
    
    vif.append(variance_inflation_factor(exog=sm.add_constant(X_train).values, exog_idx=i))
    
print(vif)
[597.5487126763895, 1.7145250443932485, 2.3328224265597584, 3.943448822674636, 1.0788133385000578, 4.410320817897635, 1.8404053075678568, 3.3267660823099408, 4.222923410477865, 7.314299817005058, 8.508856493040817, 1.839911632651406, 1.3386713255364715, 2.812544292793034]
In [108]:
pd.DataFrame({'coef_name': X_incl_const.columns, 'vif': np.around(vif, 2)})
Out[108]:
coef_name vif
0 const 597.55
1 CRIM 1.71
2 ZN 2.33
3 INDUS 3.94
4 CHAS 1.08
5 NOX 4.41
6 RM 1.84
7 AGE 3.33
8 DIS 4.22
9 RAD 7.31
10 TAX 8.51
11 PTRATIO 1.84
12 B 1.34
13 LSTAT 2.81

Bayesian Information Criterion .....the lesser the happier

image.png

image.png

image.png

image.png

Original model with log prices and all features

In [109]:
X_incl_const = sm.add_constant(X_train)

model = sm.OLS(y_train, X_incl_const)

results_1 = model.fit()

org_coef = pd.DataFrame({'coef': results_1.params, 'p-value': round(results_1.pvalues, 3)})

print('BIC with all features and the log price is      :' , results_1.bic, '\n')

print('r-squared is with all features and log price is :' , results_1.rsquared, '\n')

print(org_coef) 
BIC with all features and the log price is      : -139.74997769478875 

r-squared is with all features and log price is : 0.7930234826697582 

             coef  p-value
const    4.059944    0.000
CRIM    -0.010672    0.000
ZN       0.001579    0.009
INDUS    0.002030    0.445
CHAS     0.080331    0.038
NOX     -0.704068    0.000
RM       0.073404    0.000
AGE      0.000763    0.209
DIS     -0.047633    0.000
RAD      0.014565    0.000
TAX     -0.000645    0.000
PTRATIO -0.034795    0.000
B        0.000516    0.000
LSTAT   -0.031390    0.000

Model without INDUS

In [110]:
X_incl_const = sm.add_constant(X_train)

X_incl_const = X_incl_const.drop(['INDUS'], axis =1)

model = sm.OLS(y_train, X_incl_const)

results_2 = model.fit()

coef_minus_indus = pd.DataFrame({'coef': results_2.params, 'p-value': round(results_2.pvalues, 3)})

print('BIC without INDUS features and the log price is      :' , results_2.bic, '\n')

print('r-squared without INDUS feature  and log price is    :' , results_2.rsquared, '\n')

print(coef_minus_indus) 
BIC without INDUS features and the log price is      : -145.14508855591163 

r-squared without INDUS feature  and log price is    : 0.7927126289415163 

             coef  p-value
const    4.056231    0.000
CRIM    -0.010721    0.000
ZN       0.001551    0.010
CHAS     0.082795    0.032
NOX     -0.673365    0.000
RM       0.071739    0.000
AGE      0.000766    0.207
DIS     -0.049394    0.000
RAD      0.014014    0.000
TAX     -0.000596    0.000
PTRATIO -0.034126    0.000
B        0.000511    0.000
LSTAT   -0.031262    0.000

Model without INDUS AND AGE

In [111]:
X_incl_const = sm.add_constant(X_train)

X_incl_const = X_incl_const.drop(['INDUS', 'AGE'], axis =1)

model = sm.OLS(y_train, X_incl_const)

results_3 = model.fit()

coef_minus_indus_age = pd.DataFrame({'coef': results_3.params, 'p-value': round(results_3.pvalues, 3)})

print('BIC without INDUS features and the log price is      :' , results_3.bic, '\n')

print('r-squared without INDUS feature  and log price is    :' , results_3.rsquared, '\n')

print(coef_minus_indus_age) 
BIC without INDUS features and the log price is      : -149.49934294224656 

r-squared without INDUS feature  and log price is    : 0.7918657661852815 

             coef  p-value
const    4.035922    0.000
CRIM    -0.010702    0.000
ZN       0.001461    0.014
CHAS     0.086449    0.025
NOX     -0.616448    0.000
RM       0.076133    0.000
DIS     -0.052692    0.000
RAD      0.013743    0.000
TAX     -0.000590    0.000
PTRATIO -0.033481    0.000
B        0.000518    0.000
LSTAT   -0.030271    0.000
In [112]:
X_incl_const
Out[112]:
const CRIM ZN CHAS NOX RM DIS RAD TAX PTRATIO B LSTAT
50 1.0 0.08873 21.0 0.0 0.439 5.963 6.8147 4.0 243.0 16.8 395.56 13.45
367 1.0 13.52220 0.0 0.0 0.631 3.863 1.5106 24.0 666.0 20.2 131.42 13.33
34 1.0 1.61282 0.0 0.0 0.538 6.096 3.7598 4.0 307.0 21.0 248.31 20.34
78 1.0 0.05646 0.0 0.0 0.437 6.232 5.0141 5.0 398.0 18.7 386.40 12.34
172 1.0 0.13914 0.0 0.0 0.510 5.572 2.5961 5.0 296.0 16.6 396.90 14.69
... ... ... ... ... ... ... ... ... ... ... ... ...
320 1.0 0.16760 0.0 0.0 0.493 6.426 4.5404 5.0 287.0 19.6 396.90 7.20
15 1.0 0.62739 0.0 0.0 0.538 5.834 4.4986 4.0 307.0 21.0 395.62 8.47
484 1.0 2.37857 0.0 0.0 0.583 5.871 3.7240 24.0 666.0 20.2 370.73 13.34
125 1.0 0.16902 0.0 0.0 0.581 5.986 1.9929 2.0 188.0 19.1 385.02 14.81
265 1.0 0.76162 20.0 0.0 0.647 5.560 1.9865 5.0 264.0 13.0 392.40 10.45

404 rows × 12 columns

In [113]:
X_incl_const = sm.add_constant(X_train)
In [114]:
X_incl_const 
Out[114]:
const CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
50 1.0 0.08873 21.0 5.64 0.0 0.439 5.963 45.7 6.8147 4.0 243.0 16.8 395.56 13.45
367 1.0 13.52220 0.0 18.10 0.0 0.631 3.863 100.0 1.5106 24.0 666.0 20.2 131.42 13.33
34 1.0 1.61282 0.0 8.14 0.0 0.538 6.096 96.9 3.7598 4.0 307.0 21.0 248.31 20.34
78 1.0 0.05646 0.0 12.83 0.0 0.437 6.232 53.7 5.0141 5.0 398.0 18.7 386.40 12.34
172 1.0 0.13914 0.0 4.05 0.0 0.510 5.572 88.5 2.5961 5.0 296.0 16.6 396.90 14.69
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
320 1.0 0.16760 0.0 7.38 0.0 0.493 6.426 52.3 4.5404 5.0 287.0 19.6 396.90 7.20
15 1.0 0.62739 0.0 8.14 0.0 0.538 5.834 56.5 4.4986 4.0 307.0 21.0 395.62 8.47
484 1.0 2.37857 0.0 18.10 0.0 0.583 5.871 41.9 3.7240 24.0 666.0 20.2 370.73 13.34
125 1.0 0.16902 0.0 25.65 0.0 0.581 5.986 88.4 1.9929 2.0 188.0 19.1 385.02 14.81
265 1.0 0.76162 20.0 3.97 0.0 0.647 5.560 62.8 1.9865 5.0 264.0 13.0 392.40 10.45

404 rows × 14 columns

In [115]:
X_incl_const.shape
Out[115]:
(404, 14)
In [116]:
X_train.shape
Out[116]:
(404, 13)

image.png

image.png

image.png

image.png

image.png

RESIDIAL PLOT OF MODEL 3 THE LOG MODEL MINUS INDUS AND AGE

In [117]:
plt.figure(figsize=(20,6))
plt.scatter(x=results_3.fittedvalues, y=results_3.resid, c='brown', alpha=0.8)
plt.xlabel('Predicted log prices $\hat y _i$', fontsize=20)
plt.ylabel('Residuals', fontsize=20)
plt.title('Residuals vs Fitted Values of the reduced log model', fontsize=20)

plt.show()

# Mean Squared Error & R-Squared
reduced_log_mse_minus_indus_minus_age = round(results_3.mse_resid, 3)# for later use and comparision
reduced_log_rsquared_minus_indus_minus_age = round(results_3.rsquared, 3) # for later use and comparision

CREATING A CRAZY ...OMITTED VARIABLES MODEL: INDUS , AGE , LSTAT , RM, NOX, CRIM

In [118]:
X_incl_const = sm.add_constant(X_train)

X_incl_const = X_incl_const.drop(['INDUS', 'AGE','LSTAT', 'RM', 'NOX', 'CRIM' ], axis =1)

model = sm.OLS(y_train, X_incl_const)

results_4 = model.fit()

coef_minus_many_variables = pd.DataFrame({'coef': results_4.params, 'p-value': round(results_4.pvalues, 3)})

print('BIC after dropping many variables and the log price is      :' , results_4.bic, '\n')

print('r-squared after dropping many variables and log price is    :' , results_4.rsquared, '\n')

print(coef_minus_many_variables) 
BIC after dropping many variables and the log price is      : 211.6927723817349 

r-squared after dropping many variables and log price is    : 0.4599600787154918 

             coef  p-value
const    3.934826    0.000
ZN       0.003108    0.001
CHAS     0.150320    0.014
DIS     -0.004306    0.698
RAD      0.014640    0.000
TAX     -0.001400    0.000
PTRATIO -0.045291    0.000
B        0.000907    0.000

RESIDUAL PLOT: OMITTED VARIABLES MODEL 'INDUS', 'AGE','LSTAT', 'RM', 'NOX', 'CRIM'

In [119]:
plt.figure(figsize=(20,6))
plt.scatter(x=results_4.fittedvalues, y=results_4.resid, c='brown', alpha=0.8)
plt.xlabel('Predicted log prices after omitting many variables $\hat y _i$', fontsize=20)
plt.ylabel('Residuals', fontsize=20)
plt.title('Residuals vs Fitted Values after ommitting many variables', fontsize=20)

plt.show()

# Mean Squared Error & R-Squared
ommited_var_mse = round(results_4.mse_resid, 3)# for later use and comparision
ommitted_var_rsquared = round(results_4.rsquared, 3) # for later use and comparision

GOING BACK TO THE REDUCED LOG MODEL WITHOUT INDUS AND AGE

In [120]:
resid_mean = round(results_3.resid.mean(), 3)
resid_mean
Out[120]:
-0.0
In [121]:
resid_skew = round(results_3.resid.skew(), 3)
resid_skew
Out[121]:
0.118
In [122]:
plt.figure(figsize= (20,6))
sns.distplot(results_3.resid, color='navy')
plt.title(f'Residual Dist of Log price model w/o INDUS and AGE:residuals Skew ({resid_skew}) Mean ({resid_mean})',fontsize = 20)
plt.show()
In [123]:
frames = [org_coef, coef_minus_indus, coef_minus_indus_age, coef_minus_many_variables]
pd.concat(frames, axis=1, sort = False)
Out[123]:
coef p-value coef p-value coef p-value coef p-value
const 4.059944 0.000 4.056231 0.000 4.035922 0.000 3.934826 0.000
CRIM -0.010672 0.000 -0.010721 0.000 -0.010702 0.000 NaN NaN
ZN 0.001579 0.009 0.001551 0.010 0.001461 0.014 0.003108 0.001
INDUS 0.002030 0.445 NaN NaN NaN NaN NaN NaN
CHAS 0.080331 0.038 0.082795 0.032 0.086449 0.025 0.150320 0.014
NOX -0.704068 0.000 -0.673365 0.000 -0.616448 0.000 NaN NaN
RM 0.073404 0.000 0.071739 0.000 0.076133 0.000 NaN NaN
AGE 0.000763 0.209 0.000766 0.207 NaN NaN NaN NaN
DIS -0.047633 0.000 -0.049394 0.000 -0.052692 0.000 -0.004306 0.698
RAD 0.014565 0.000 0.014014 0.000 0.013743 0.000 0.014640 0.000
TAX -0.000645 0.000 -0.000596 0.000 -0.000590 0.000 -0.001400 0.000
PTRATIO -0.034795 0.000 -0.034126 0.000 -0.033481 0.000 -0.045291 0.000
B 0.000516 0.000 0.000511 0.000 0.000518 0.000 0.000907 0.000
LSTAT -0.031390 0.000 -0.031262 0.000 -0.030271 0.000 NaN NaN
In [124]:
print(f'BIC WITH ALL:{round(results_1.bic)}:W/O INDUS {round(results_2.bic)}:W/O INDUS AND AGE {round(results_3.bic)}:W/O MANY VARIABLES {round(results_4.bic)}')
              
BIC WITH ALL:-140.0:W/O INDUS -145.0:W/O INDUS AND AGE -149.0:W/O MANY VARIABLES 212.0
In [125]:
print(f'RSQD:WITH ALL {round((results_1.rsquared),2)}: W/O INDUS {round((results_2.rsquared),2)}: W/O INDUS,AGE {round((results_3.rsquared),2)}:W/O MANY VARIABLES {round((results_4.rsquared),2)}')
RSQD:WITH ALL 0.79: W/O INDUS 0.79: W/O INDUS,AGE 0.79:W/O MANY VARIABLES 0.46
In [126]:
# Our Estimate for a house is 30000....calculate the upper bound and the lower bound using the log model_minus_indus_minus_age

sd_1 = round(np.sqrt(results_3.mse_resid),3)
sd_2 = 2*(round(np.sqrt(results_3.mse_resid),3))

print('1 s.d. in log prices is', sd_1)
print('2 s.d. in log prices is', sd_2)

upper_bound = np.log(30) + sd_2
print('The upper bound in log prices for a 95% prediction interval is ', upper_bound)
print('The upper bound in normal prices is $', np.e**upper_bound * 1000)

lower_bound = np.log(30) - sd_2
print('The lower bound in log prices for a 95% prediction interval is ', lower_bound)
print('The lower bound in normal prices is $', np.e**lower_bound * 1000)
1 s.d. in log prices is 0.187
2 s.d. in log prices is 0.374
The upper bound in log prices for a 95% prediction interval is  3.7751973816621556
The upper bound in normal prices is $ 43606.11451370558
The lower bound in log prices for a 95% prediction interval is  3.0271973816621554
The lower bound in normal prices is $ 20639.307354869376
In [127]:
data
Out[127]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT PRICE
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 36.2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 21.0 391.99 9.67 22.4
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 21.0 396.90 9.08 20.6
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 21.0 396.90 5.64 23.9
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 21.0 393.45 6.48 22.0
505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 21.0 396.90 7.88 11.9

506 rows × 14 columns

EXAMINING THE DOLLAR PRICE MODEL WITHOUT THE LOG CONVERSION

In [148]:
house_prices = data['PRICE']
In [149]:
house_features = data.drop(['PRICE'], axis =1)
In [150]:
X_train, X_test, y_train, y_test = train_test_split(house_features, house_prices, 
                                                    test_size=0.2, random_state=10)
In [151]:
X_incl_const_to_house_features = sm.add_constant(X_train)
In [152]:
model = sm.OLS(y_train, X_incl_const_to_house_features)
In [153]:
results_5 = model.fit()
In [154]:
round(results_5.params,3)
Out[154]:
const      36.533
CRIM       -0.128
ZN          0.063
INDUS      -0.008
CHAS        1.975
NOX       -16.272
RM          3.108
AGE         0.016
DIS        -1.483
RAD         0.304
TAX        -0.012
PTRATIO    -0.820
B           0.011
LSTAT      -0.582
dtype: float64
In [155]:
results_5.bic
Out[155]:
2424.9546968561976
In [156]:
results_5.rsquared
Out[156]:
0.750121534530608
In [157]:
round(results_5.pvalues,2)
Out[157]:
const      0.00
CRIM       0.00
ZN         0.00
INDUS      0.90
CHAS       0.03
NOX        0.00
RM         0.00
AGE        0.26
DIS        0.00
RAD        0.00
TAX        0.00
PTRATIO    0.00
B          0.00
LSTAT      0.00
dtype: float64
In [158]:
plt.figure(figsize=(20,6))
plt.scatter(x=results_5.fittedvalues, y=results_5.resid, c='brown', alpha=0.8)
plt.xlabel('Predicted Original prices $\hat y _i$', fontsize=20)
plt.ylabel('Residuals', fontsize=20)
plt.title('Residuals vs Fitted Values of the original model', fontsize=20)

plt.show()

# Mean Squared Error & R-Squared
original_mse = round(results_5.mse_resid, 3)# for later use and comparision
original_rsquared = round(results_5.rsquared, 3) # for later use and comparision
In [159]:
plt.figure(figsize= (20,6))
sns.distplot(results_5.resid, color='navy')
plt.title(f'Residual Dist Dollar price model)',fontsize = 20)
plt.show()
In [160]:
print(f'Skew:({round(results_5.resid.skew(),2)}), residuals mean : ({round(results_5.resid.mean(),2)}))')
Skew:(1.46), residuals mean : (0.0))
In [ ]:
 
In [ ]:
 
In [ ]: